#!/usr/bin/env python
# -*- coding: utf-8 -*-

import MySQLdb
import re
from stemmer import Stemmer
import sys

# Pattern for one markup tag: '<', at least one character that is not '>', '>'.
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` removed."""
    # Splitting on the tags and gluing the remaining pieces back together
    # leaves exactly the text that was outside any tag.
    pieces = TAG_RE.split(text)
    return ''.join(pieces)

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module must be reloaded to get it back.  Switching the default to UTF-8
# makes implicit str <-> unicode conversions use UTF-8 instead of ASCII.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Upper bound on the number of rows processed in a single run.
MAX_ROWS = 1000

db = MySQLdb.connect(host="localhost", user="root", passwd="", db="wikiubb", charset='utf8')
cursor = db.cursor()
try:
    stemmerObj = Stemmer()

    sql = "SELECT COUNT(*) FROM text"
    cursor.execute(sql)
    result = cursor.fetchone()
    # Stop at the end of the table or at the cap, whichever comes first, so
    # that small tables are not paged past their last row.
    number_of_rows = min(result[0], MAX_ROWS)

    iLimit = 50   # page size
    offset = 0

    it = 0        # number of documents processed so far (progress counter)
    wordsSet = set()

    # NOTE(review): without an ORDER BY the row order of LIMIT/OFFSET pages is
    # not guaranteed to be stable between queries; consider ordering by the
    # table's key column (presumably old_id -- confirm against the schema).
    sql = "SELECT * FROM text LIMIT %s OFFSET %s"
    while offset < number_of_rows:
        try:
            cursor.execute(sql, (iLimit, offset))
            results = cursor.fetchall()
            for doc in results:
                # doc[1] is handed to the stemmer; presumably the text column
                # of the row -- confirm against the schema.
                final_doc = stemmerObj.stemming(doc[1])
                wordsSet.update(final_doc)

                # Disabled: write the stemmed text back to the row.
                #   stem_text = " ".join(final_doc)
                #   sql = 'UPDATE text SET stem_text=%s WHERE old_id=%s'
                #   cursor.execute(sql, (stem_text, doc[0]))

                it += 1
                print(it)
        except Exception as e:
            # Best effort: report the failure and carry on with the next page.
            print("Error: unable to fetch data %s" % (e,))
        # Advance even after a failure; otherwise a page that keeps failing
        # would be retried at the same offset forever.
        offset += iLimit

    # Insert into Words table
    print("Inserting words...")
    # Let the driver quote the value: words containing quotes or backslashes
    # would otherwise break (or inject into) the statement.
    sql = 'INSERT INTO words (word) VALUES (%s)'
    for word1 in wordsSet:
        cursor.execute(sql, (word1,))
    db.commit()
    print("Finished inserting words.")
finally:
    # Release the cursor and the connection even when a step above failed.
    cursor.close()
    db.close()